import re


def split_chars(text):
    """Partition the distinct characters of *text* by a CJK code-point range.

    A character counts as "in range" when it lies in U+4E00..U+9FA5
    (the range matched by the regex below); everything else, including
    whitespace and newlines, counts as "out of range".

    Args:
        text: The string to inspect.

    Returns:
        A ``(outside, inside)`` tuple of sets: the distinct characters
        outside the range, and the distinct characters inside it.
    """
    outside = set(re.findall(r'[^\u4e00-\u9fa5]', text))
    inside = set(re.findall(r'[\u4e00-\u9fa5]', text))
    return outside, inside


if __name__ == '__main__':
    file = r'/home/qinghua-user2/project_gaofie/language_model_datasets/original_datasets/mdb/kbcorpus.txt'
    # Read the corpus once and release the handle before the regex work;
    # both scans operate on the same in-memory string.
    with open(file, 'r', encoding='utf-8') as fr:
        text = fr.read()

    res_set, res_char_set = split_chars(text)
    print(res_set)
    print(res_char_set)
    print(len(res_char_set))